In [1]:
import os
import sys

In [2]:
import pandas

In [3]:
sys.path.append(os.path.join(os.path.abspath('../..'), 'src'))

In [4]:
import utils

In [5]:
with open('/home/immersinn/Dropbox/Analytics/NCGA/PACDocs/NC Campaign Document Search By Type-2014.html') as f:
    html = f.read()

In [6]:
from bs4 import BeautifulSoup as bs

In [7]:
soup = bs(html, 'html.parser')

In [8]:
tables = soup.find_all('table')

In [9]:
len(tables)


Out[9]:
1

In [10]:
table = tables[0]

In [11]:
rows = table.find_all('tr')

In [12]:
len(rows)


Out[12]:
12590

In [13]:
header = rows[0]
col_names = [h.text for h in header.find_all('td')]
col_names


Out[13]:
['Committee Name',
 'Report Year',
 'Report Type',
 'Amend',
 'Received Date',
 'Start Date',
 'End Date',
 'Image',
 'Data']

In [14]:
sub_info = rows[1:100]

In [15]:
sub_info[0].find_all('td')


Out[15]:
[<td nowrap="nowrap">01ST CONG DIST DEC</td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;">2014</td>,
 <td nowrap="nowrap">Fourth Quarter</td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;">N</td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;">01/06/2015</td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;">10/19/2014</td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;">12/31/2014</td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;"><a href="http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.aspx?DID=182000">IMAGE</a></td>,
 <td align="center" nowrap="nowrap" style="font-weight:normal;font-style:normal;text-decoration:none;"><a href="http://cf.ncsbe.gov/CFOrgLkup/cf_report_sections.aspx?RID=132598&amp;SID=STA-C3826N-C-001&amp;CN=01ST%20CONG%20DIST%20DEC&amp;RN=2014%20Fourth%20Quarter">DATA</a></td>]

In [16]:
for cn,col in zip(col_names, sub_info[3].find_all('td')):
    a = col.find('a')
    if a:
        try:
            print(cn + ' - ' + col.text + ': ' + col.find('a')['href'])
        except KeyError:
            print(cn + ' - ' + 'NO DATA LINK')
    else:
        print(cn + ' - ' + col.text)


Committee Name - 01ST CONG DIST DEC
Report Year - 2014
Report Type - Penalty Assessment Letter
Amend -  
Received Date - 10/30/2014
Start Date -  
End Date -  
Image - IMAGE: http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.aspx?DID=179531
Data - NO DATA LINK

In [17]:
def process_table(table, limit=100):
    
    def process_row(row):
        data = {}
        for cn, col in zip(col_names, row.find_all('td')):
            a = col.find('a')
            if a:
                try:
                    data[cn] = a['href']
                except KeyError:
                    data[cn] = ''
            else:
                data[cn] = col.text.strip()
        return(data)

    rows = table.find_all('tr')
    col_names = [h.text for h in rows[0].find_all('td')]
    
    data = [process_row(row) for row in rows[1:limit]]
    df = pandas.DataFrame(data)
    df = df[col_names]
    
    return(df)

In [18]:
df = process_table(table, limit=100)

In [42]:
df.shape


Out[42]:
(99, 9)

In [20]:
df.head()


Out[20]:
Committee Name Report Year Report Type Amend Received Date Start Date End Date Image Data
0 01ST CONG DIST DEC 2014 Fourth Quarter N 01/06/2015 10/19/2014 12/31/2014 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
1 01ST CONG DIST DEC 2014 Third Quarter Y 01/06/2015 07/01/2014 10/18/2014 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
2 01ST CONG DIST DEC 2014 Paid Penalty Assessment or Forfeiture 11/07/2014 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
3 01ST CONG DIST DEC 2014 Penalty Assessment Letter 10/30/2014 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
4 01ST CONG DIST DEC 2014 Third Quarter N 10/27/2014 07/01/2014 10/18/2014 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....

In [21]:
df['Report Type'].unique()


Out[21]:
array(['Fourth Quarter', 'Third Quarter',
       'Paid Penalty Assessment or Forfeiture',
       'Penalty Assessment Letter', 'Second Quarter', 'First Quarter', '',
       'Certification of Treasurer',
       'Political Party Executive Committee Exempt Sales Plan',
       'Penalty Appeal', 'Miscellaneous Correspondence',
       'Year End Semi-Annual', 'Mid Year Semi-Annual',
       'Correspondence Returned Undeliverable or Unclaimed',
       'Non-Compliance Letter'], dtype=object)

Test Code


In [37]:
reload(process_NCCampaignDocument_data)


Out[37]:
<module 'process_NCCampaignDocument_data' from '/home/immersinn/gits/ncga/src/process_NCCampaignDocument_data.py'>

In [31]:
import process_NCCampaignDocument_data

In [65]:
df2015 = process_NCCampaignDocument_data.lae_nccd('2015')

In [66]:
df2015.shape


Out[66]:
(8988, 9)

In [67]:
df2015.head()


Out[67]:
committee report_year report_type amend rec_date start_date end_date image_link data_link
0 01ST CONG DIST DEC 2015 Year End Semi-Annual 0 2016-01-26 2015-07-01 2015-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
1 01ST CONG DIST DEC 2015 Mid Year Semi-Annual 1 2015-08-07 2015-01-01 2015-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
2 01ST CONG DIST DEC 2015 Mid Year Semi-Annual 0 2015-07-29 2015-01-01 2015-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
3 01ST CONG DIST REC 2015 Year End Semi-Annual 0 2016-01-25 2015-07-01 2015-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
4 01ST CONG DIST REC 2015 Mid Year Semi-Annual 0 2015-07-16 2015-01-01 2015-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...

In [68]:
df2015.tail()


Out[68]:
committee report_year report_type amend rec_date start_date end_date image_link data_link
8983 ZENECA INC PAC 2015 Year End Semi-Annual 0 2016-01-29 2015-07-01 2015-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
8984 ZENECA INC PAC 2015 Mid Year Semi-Annual 0 2015-07-30 2015-01-01 2015-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
8985 ZUMWALT A VOICE FOR MOORE COUNTY 2015 Year End Semi-Annual 0 2016-02-01 2015-12-23 2015-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
8986 ZUMWALT A VOICE FOR MOORE COUNTY 2015 0 2015-12-28 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
8987 ZUMWALT A VOICE FOR MOORE COUNTY 2015 Certification of Treasurer 0 2015-12-28 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....

In [70]:
df2015[df2015.data_link=='MISSING DATA']


Out[70]:
committee report_year report_type amend rec_date start_date end_date image_link data_link

In [46]:
df2015[''].apply(len).describe()


Out[46]:
count    6725.000000
mean       31.903792
std        62.597587
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       241.000000
Name: Data, dtype: float64

In [50]:
df2015['Received Date'][0].split('/')


Out[50]:
['01', '26', '2016']

In [51]:
df2014.columns


Out[51]:
Index(['Committee Name', 'Report Year', 'Report Type', 'Amend',
       'Received Date', 'Start Date', 'End Date', 'Image', 'Data'],
      dtype='object')

In [52]:
col_mapping = {'Committee Name' : 'committee', 
               'Report Year' : 'report_year',
               'Report Type' : 'report_type',
               'Amend' : 'amend',
               'Received Date' : 'rec_date',
               'Start Date' : 'start_date',
               'End Date' : 'end_date',
               'Image' : 'image_link',
               'Data' : 'data_link',
               }

In [53]:
df2014.columns = [col_mapping[c] for c in df2015.columns]

In [54]:
df2015.head()


Out[54]:
committee report_year report_type amend rec_date start_date end_date image_link data_link
0 01ST CONG DIST DEC 2015 Year End Semi-Annual N 01/26/2016 07/01/2015 12/31/2015 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
1 01ST CONG DIST DEC 2015 Mid Year Semi-Annual Y 08/07/2015 01/01/2015 06/30/2015 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
2 01ST CONG DIST DEC 2015 Mid Year Semi-Annual N 07/29/2015 01/01/2015 06/30/2015 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
3 01ST CONG DIST REC 2015 Year End Semi-Annual N 01/25/2016 07/01/2015 12/31/2015 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
4 01ST CONG DIST REC 2015 Mid Year Semi-Annual N 07/16/2015 01/01/2015 06/30/2015 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...

All Data


In [96]:
import init_NCCampaignDocument_table

In [139]:
reload(init_NCCampaignDocument_table)


Out[139]:
<module 'init_NCCampaignDocument_table' from '/home/immersinn/gits/ncga/src/init_NCCampaignDocument_table.py'>

In [143]:
df2014 = init_NCCampaignDocument_table.lae_nccd('2014')

In [144]:
df2014.shape


Out[144]:
(13611, 9)

In [145]:
df2014.head()


Out[145]:
committee report_year report_type amend rec_date start_date end_date image_link data_link
0 01ST CONG DIST DEC 2014 Fourth Quarter 0 2015-01-06 2014-10-19 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
1 01ST CONG DIST DEC 2014 Third Quarter 1 2015-01-06 2014-07-01 2014-10-18 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image.... http://cf.ncsbe.gov/CFOrgLkup/cf_report_sectio...
2 01ST CONG DIST DEC 2014 Paid Penalty Assessment or Forfeiture 0 2014-11-07 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
3 01ST CONG DIST DEC 2014 Penalty Assessment Letter 0 2014-10-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
4 01ST CONG DIST DEC 2014 Third Quarter 0 2014-10-27 2014-07-01 2014-10-18 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....

In [146]:
df2014.tail()


Out[146]:
committee report_year report_type amend rec_date start_date end_date image_link data_link
13606 YOUNTS FOR NC HOUSE 2014 48-Hour 0 2014-04-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
13607 YOUNTS FOR NC HOUSE 2014 48-Hour 0 2014-04-28 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
13608 ZENECA INC PAC 2014 Fourth Quarter 0 2015-01-12 2014-10-19 2014-12-31 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
13609 ZENECA INC PAC 2014 Third Quarter 0 2014-10-23 2014-07-01 2014-10-18 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....
13610 ZENECA INC PAC 2014 Second Quarter 0 2014-07-09 2014-04-20 2014-06-30 http://cf.ncsbe.gov/CFOrgLkup/cf_report_image....

In [147]:
df2014.committee.apply(len).describe()


Out[147]:
count    13611.000000
mean        24.992359
std         10.785938
min          5.000000
25%         17.500000
50%         25.000000
75%         31.000000
max         89.000000
Name: committee, dtype: float64

In [149]:
df2014.image_link.apply(len).describe()


Out[149]:
count    13611.000000
mean        60.923812
std          2.154536
min          0.000000
25%         61.000000
50%         61.000000
75%         61.000000
max         61.000000
Name: image_link, dtype: float64

In [150]:
df2014.data_link.apply(len).describe()


Out[150]:
count    13611.000000
mean        46.788774
std         68.847038
min          0.000000
25%          0.000000
50%          0.000000
75%        134.000000
max        241.000000
Name: data_link, dtype: float64

In [151]:
df2014.rec_date.describe()


Out[151]:
count          13611
unique           520
top       2014-04-28
freq             538
Name: rec_date, dtype: object

Names of all Report Types

From Selection Box


In [3]:
with open('/home/immersinn/gits/ncga/references/report_type_names.txt', 'r') as f:
    rtn = f.readlines()

In [4]:
def process_rtn_entry(ent):
    ent = ent.strip('\n')
    ent = ent.split(' - ')[1]
    ent = ent.split(' (')[0]
    return(ent)

In [5]:
rtns = list(set([process_rtn_entry(r) for r in rtn]))
rtns = sorted(rtns)
rtns


Out[5]:
['24-Hour Electioneering Communications',
 '48-Hour',
 'Annual',
 'Audit Letter',
 'Campaign Reporter',
 'Candidate Designation of Committee Funds',
 'Candidate Specific Communications',
 'Certification of Inactive Status',
 'Certification of Incorporated Political Committee',
 'Certification of Return to Active Status',
 'Certification of Threshold',
 'Certification of Treasurer',
 'Certification to Close Committee',
 'Contribution from a Business Account Statement',
 'Correspondence Returned Undeliverable or Unclaimed',
 'Declaration of Intent',
 'Deferred Notice',
 'District Attorney Letter',
 'Electioneering Communications Report',
 'Federal Mid Year',
 'Federal Year End',
 'Final',
 'First Quarter',
 'Forgiven Loan Statement',
 'Fourth Quarter',
 'Independent Expenditure Political Committee Certification',
 'Independent Expenditure Report',
 'Independent Expenditure for Registered Committees',
 'Independent Expenditure for non-Committees',
 'Interim',
 'Judicial Qualifying Contributions Report',
 'Loan Proceeds Statement',
 'Mid Year Semi-Annual',
 'Miscellaneous Correspondence',
 'Monthly',
 'Municipal Voter-Owned Election Qualifying Contributions',
 'Non-Compliance Letter',
 'Non-Participating Candidate',
 'Notice of Candidacy',
 'Notice of Termination of Active Status',
 'Notification of Change to Reporting Schedule',
 'Organizational',
 'Other',
 'Paid Penalty Assessment or Forfeiture',
 'Penalty Appeal',
 'Penalty Appeal Decision',
 'Penalty Assessment',
 'Penalty Assessment Letter',
 'Penalty Resolution Agreement Executed',
 'Penalty Resolution Agreement Proposal',
 'Penalty Waiver Letter',
 'Political Party Executive Committee Exempt Sales Plan',
 'Post General',
 'Post Primary',
 'Pre-Election',
 'Pre-Primary',
 'Pre-Referendum',
 'Pre-Runoff',
 'Rescind Letter',
 'Second Quarter',
 'Signed Penalty Waiver Agreement',
 'Special',
 'Statement of Organization',
 'Supplemental Final',
 'Ten-day',
 'Third Quarter',
 'Thirty-day',
 'Thirty-five-day',
 'Twelve-day',
 'Voter-Owned Election Qualifying Contributions',
 'Weekly',
 'Year End Semi-Annual']

In [9]:
','.join(["'" + r + "'" for r in rtns])


Out[9]:
"'24-Hour Electioneering Communications','48-Hour','Annual','Audit Letter','Campaign Reporter','Candidate Designation of Committee Funds','Candidate Specific Communications','Certification of Inactive Status','Certification of Incorporated Political Committee','Certification of Return to Active Status','Certification of Threshold','Certification of Treasurer','Certification to Close Committee','Contribution from a Business Account Statement','Correspondence Returned Undeliverable or Unclaimed','Declaration of Intent','Deferred Notice','District Attorney Letter','Electioneering Communications Report','Federal Mid Year','Federal Year End','Final','First Quarter','Forgiven Loan Statement','Fourth Quarter','Independent Expenditure Political Committee Certification','Independent Expenditure Report','Independent Expenditure for Registered Committees','Independent Expenditure for non-Committees','Interim','Judicial Qualifying Contributions Report','Loan Proceeds Statement','Mid Year Semi-Annual','Miscellaneous Correspondence','Monthly','Municipal Voter-Owned Election Qualifying Contributions','Non-Compliance Letter','Non-Participating Candidate','Notice of Candidacy','Notice of Termination of Active Status','Notification of Change to Reporting Schedule','Organizational','Other','Paid Penalty Assessment or Forfeiture','Penalty Appeal','Penalty Appeal Decision','Penalty Assessment','Penalty Assessment Letter','Penalty Resolution Agreement Executed','Penalty Resolution Agreement Proposal','Penalty Waiver Letter','Political Party Executive Committee Exempt Sales Plan','Post General','Post Primary','Pre-Election','Pre-Primary','Pre-Referendum','Pre-Runoff','Rescind Letter','Second Quarter','Signed Penalty Waiver Agreement','Special','Statement of Organization','Supplemental Final','Ten-day','Third Quarter','Thirty-day','Thirty-five-day','Twelve-day','Voter-Owned Election Qualifying Contributions','Weekly','Year End Semi-Annual'"

From Data


In [164]:
sorted(dfall.report_type.unique())


Out[164]:
['',
 '24-Hour Electioneering Communications',
 '48-Hour',
 'Annual',
 'Audit Letter',
 'Candidate Designation of Committee Funds',
 'Certification of Inactive Status',
 'Certification of Incorporated Political Committee',
 'Certification of Return to Active Status',
 'Certification of Threshold',
 'Certification of Treasurer',
 'Certification to Close Committee',
 'Contribution from a Business Account Statement',
 'Correspondence Returned Undeliverable or Unclaimed',
 'Deferred Notice',
 'District Attorney Letter',
 'Electioneering Communications Report',
 'Final',
 'First Quarter',
 'Forgiven Loan Statement',
 'Fourth Quarter',
 'Independent Expenditure Report',
 'Independent Expenditure for Registered Committees',
 'Independent Expenditure for non-Committees',
 'Loan Proceeds Statement',
 'Mid Year Semi-Annual',
 'Miscellaneous Correspondence',
 'Non-Compliance Letter',
 'Notice of Candidacy',
 'Notice of Termination of Active Status',
 'Organizational',
 'Paid Penalty Assessment or Forfeiture',
 'Penalty Appeal',
 'Penalty Appeal Decision',
 'Penalty Assessment - 30 Days Aged',
 'Penalty Assessment Letter',
 'Penalty Resolution Agreement Executed',
 'Penalty Waiver Letter',
 'Political Party Executive Committee Exempt Sales Plan',
 'Pre-Election',
 'Pre-Primary',
 'Pre-Referendum',
 'Pre-Runoff',
 'Second Quarter',
 'Special',
 'Supplemental Final',
 'Ten-day',
 'Third Quarter',
 'Thirty-five-day',
 'Year End Semi-Annual']